{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../code\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Sub-module available for the above is sent_tokenize.\n",
    "            An obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization. \n",
    "            Imagine you need to count average words per sentence, how you will calculate? \n",
    "            For accomplishing such a task, you need both sentence tokenization as well as words to calculate the ratio. \n",
    "            Such output serves as an important feature for machine training as the answer would be numeric. \n",
    "            Check the below example to learn how sentence tokenization is different from words tokenization.\n",
    "            The taxes the President announced will not lower work incentives. Evrika!\n",
    "            John can’t keep up with Mary’s rapid mood swings.\n",
    "            \"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import tokenize_into_sentences, filter_sentences, preprocess, UsedRoles\n",
    "from word_embedding import run_word2vec, compute_embedding, USE, SIF_Word2Vec\n",
    "from semantic_role_labeling import SRL, extract_roles, postprocess_roles\n",
    "from clustering import Clustering\n",
    "from sklearn.cluster import KMeans\n",
    "from cooccurrence import build_df, CoOccurrence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "used_roles=UsedRoles()\n",
    "used_roles['ARG2']=True\n",
    "print(f\"{used_roles.used}\\n{used_roles.embeddable}\\n{used_roles.not_embeddable}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "srl = SRL(\"./srl-model-2018.05.25.tar.gz\")\n",
    "srl([\" \".join([\"What\",\"are\",\"you\",\"doing?\"])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "srl = SRL(\"./bert-base-srl-2020.03.24.tar.gz\")\n",
    "srl([\" \".join([\"What\",\"are\",\"you\",\"doing\"])])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "srl = SRL(\"./srl-model-2018.05.25.tar.gz\")\n",
    "srl([\"the government increases spending without raising taxes.\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "srl = SRL(\"./bert-base-srl-2020.03.24.tar.gz\")\n",
    "srl([\"the government increases spending without raising taxes.\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "srl = SRL(\"./srl-model-2018.05.25.tar.gz\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "use = USE('./USE-4')\n",
    "use([\"What\",\"are\",\"you\",\"doing\"]).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sif_w2v = SIF_Word2Vec(\"./nytimes_word2vec.model\")\n",
    "sif_w2v([\"what\",\"are\",\"you\",\"doing\"]).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans=KMeans(random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = tokenize_into_sentences(text)\n",
    "sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = filter_sentences(sentences, max_sentence_length=350)\n",
    "sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "srl_res = srl(sentences=sentences)\n",
    "srl_res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roles,sentence_index = extract_roles(srl_res)\n",
    "sentence_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "postproc_roles = postprocess_roles(roles)\n",
    "postproc_roles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sif_vectors, sif_statements_index, sif_funny_index =compute_embedding(sif_w2v,statements=postproc_roles,\n",
    "                                                                      used_roles=used_roles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sif_statements_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "{el:sif_vectors[el].shape for el in sif_vectors.keys()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sif_funny_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "postproc_roles[0][\"ARG2\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "USE_vectors, USE_statements_index, USE_funny_index = compute_embedding(use,roles,used_roles)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "USE_statements_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "{el:USE_vectors[el].shape for el in USE_vectors.keys()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "USE_funny_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clustering = Clustering(cluster=kmeans,n_clusters={'ARGO':2, 'ARG1': 1, 'ARG2':1, 'B-V':2},\n",
    "                         used_roles=used_roles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clustering.fit(vectors=sif_vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "{el:clustering._cluster[el].labels_ for el in clustering._cluster.keys()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clustering_res = clustering.predict(vectors=sif_vectors)\n",
    "clustering_res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = clustering.label_most_similar_in_w2v(sif_w2v)\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = build_df(\n",
    "    clustering_res=clustering_res,\n",
    "    postproc_roles=postproc_roles,\n",
    "    statement_index=sif_statements_index,\n",
    "    used_roles=used_roles,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc = CoOccurrence(df, labels, used_roles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.subset=None ## by convention None means take all roles\n",
    "cooc.subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.narratives_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.narratives_pmi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "cooc.subset={\"ARGO\",\"ARG1\",\"B-V\",\"B-ARGM-MOD\",\"B-ARGM-NEG\"}\n",
    "print(cooc.normal_order)\n",
    "cooc.display_order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.narratives_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.narratives_pmi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.subset={\"ARGO\",\"ARG1\",\"B-V\"}\n",
    "cooc.narratives_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cooc.narratives_pmi"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
